library(readr)
# Load the Framingham Heart Study cohort data (4240 subjects, 16 variables).
framingham <- read_csv("framingham.csv")
## Rows: 4240 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): male, age, education, currentSmoker, cigsPerDay, BPMeds, prevalent...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Remove all rows containing missing values (complete-case analysis: 4240 -> 3658 rows).
# The 582 dropped row indices are recorded in the "na.action" attribute below.
framingham <- na.omit(framingham)
# Check the structure of the dataframe
str(framingham)
## tibble [3,658 × 16] (S3: tbl_df/tbl/data.frame)
## $ male : num [1:3658] 1 0 1 0 0 0 0 0 1 1 ...
## $ age : num [1:3658] 39 46 48 61 46 43 63 45 52 43 ...
## $ education : num [1:3658] 4 2 1 3 3 2 1 2 1 1 ...
## $ currentSmoker : num [1:3658] 0 0 1 1 1 0 0 1 0 1 ...
## $ cigsPerDay : num [1:3658] 0 0 20 30 23 0 0 20 0 30 ...
## $ BPMeds : num [1:3658] 0 0 0 0 0 0 0 0 0 0 ...
## $ prevalentStroke: num [1:3658] 0 0 0 0 0 0 0 0 0 0 ...
## $ prevalentHyp : num [1:3658] 0 0 0 1 0 1 0 0 1 1 ...
## $ diabetes : num [1:3658] 0 0 0 0 0 0 0 0 0 0 ...
## $ totChol : num [1:3658] 195 250 245 225 285 228 205 313 260 225 ...
## $ sysBP : num [1:3658] 106 121 128 150 130 ...
## $ diaBP : num [1:3658] 70 81 80 95 84 110 71 71 89 107 ...
## $ BMI : num [1:3658] 27 28.7 25.3 28.6 23.1 ...
## $ heartRate : num [1:3658] 80 95 75 65 85 77 60 79 76 93 ...
## $ glucose : num [1:3658] 77 76 70 103 85 99 85 78 79 88 ...
## $ TenYearCHD : num [1:3658] 0 0 0 1 0 0 1 0 0 0 ...
## - attr(*, "na.action")= 'omit' Named int [1:582] 15 22 27 34 37 43 50 55 71 73 ...
## ..- attr(*, "names")= chr [1:582] "15" "22" "27" "34" ...
The variables are as follows:
*sex : the gender of the observations. The variable is a binary named “male” in the dataset.
*age : Age at the time of medical examination in years. (continuous)
*education : A categorical variable of the participant's education, with the levels: Some high school (1), high school/GED (2), some college/vocational school (3), college (4) (categorical)
*currentSmoker: Current cigarette smoking at the time of examinations (binary)
*cigsPerDay: Number of cigarettes smoked each day
*BPmeds: whether or not the patient was on blood pressure medication (binary)
*prevalentStroke: whether or not the patient had previously had a stroke (binary)
*prevalentHyp: whether or not the patient was hypertensive (binary)
*diabetes: whether or not the patient had diabetes (binary)
*totChol: Total cholesterol (mg/dL) (continuous)
*sysBP: Systolic Blood Pressure (mmHg) (continuous)
*diaBP: Diastolic blood pressure (mmHg) (continuous)
*BMI: Body Mass Index, weight (kg)/height (m)^2 (continuous)
*heartRate: Heart rate (beats/minute) (continuous)
*glucose: Blood glucose level (mg/dL) (continuous)
*TenYearCHD: The 10 year risk of coronary heart disease(CHD). (binary: “1”, means “Yes”, “0” means “No”)
\(\bf{Pearson \space correlation (r)}\), which measures a linear dependence between two variables (x and y). It’s also known as a parametric correlation test because it depends on the distribution of the data. It can be used only when x and y are from a normal distribution.
\(\bf{Kendall \space \tau}\) and \(\bf{Spearman \space \rho}\), which are rank-based correlation coefficients (non-parametric).
\(\bf{Kendall’s \space \tau}\): usually smaller values than \(\bf{Spearman \space \rho}\) correlation. Calculations based on concordant and discordant pairs. Insensitive to error. p values are more accurate with smaller sample sizes.
\(\bf{Spearman \space \rho}\): usually have larger values than \(\bf{Kendall’s \space \tau}\). Calculations based on deviations. Much more sensitive to error and discrepancies in data.
\(\bf{-1}\) indicates a strong negative correlation : this means that every time x increases, y decreases.
\(\bf{0}\) means that there is no association between the two variables (x and y).
\(\bf{1}\) indicates a strong positive correlation : this means that y increases with x.
\(\bf{Note \space for \space cor.test():}\)
Since the Spearman correlation coefficient is computed from the ranks of the values, tied ranks prevent the correlation test from computing an exact p-value, so we get the warning “Cannot compute exact p-value with ties”. This can be avoided by using exact = FALSE inside the cor.test() function. The values t, z, and S represent the test statistics of the corresponding correlation tests; these test statistics are used to determine the significance of the observed correlation.
#Pearson,Spearman, Kendall
#check assumption
#Shapiro-Wilk normality test
#Null hypothesis: the variable comes from a normal distribution
#Alternative hypothesis: the variable doesn't come from a normal distribution
shapiro.test(framingham$sysBP) # => p < 2.2e-16
##
## Shapiro-Wilk normality test
##
## data: framingham$sysBP
## W = 0.93407, p-value < 2.2e-16
shapiro.test(framingham$diaBP) # => p < 2.2e-16
##
## Shapiro-Wilk normality test
##
## data: framingham$diaBP
## W = 0.97397, p-value < 2.2e-16
#Conclusion: Neither sysBP nor diaBP is normally distributed, so the
#rank-based coefficients (Spearman/Kendall) are more appropriate than Pearson.
#QQ plot: points deviating from the reference line confirm non-normality visually
qqnorm(framingham$sysBP, pch = 1, frame = FALSE)
qqline(framingham$sysBP, col = "steelblue", lwd = 2)
qqnorm(framingham$diaBP, pch = 1, frame = FALSE)
qqline(framingham$diaBP, col = "steelblue", lwd = 2)
#correlation coefficient between systolic and diastolic blood pressure
cor(framingham$sysBP, framingham$diaBP, method = "pearson")
## [1] 0.7866694
cor(framingham$sysBP, framingham$diaBP, method = "kendall")
## [1] 0.599936
cor(framingham$sysBP, framingham$diaBP, method = "spearman")
## [1] 0.7803225
#correlation test
#Null hypothesis: The correlation coefficient is equal to 0
#Alternative hypothesis: The correlation coefficient is not equal to 0
# Pearson test with a 90% (rather than the default 95%) confidence interval
cor.test(framingham$sysBP, framingham$diaBP, method = c("pearson"),conf.level = 0.90)
##
## Pearson's product-moment correlation
##
## data: framingham$sysBP and framingham$diaBP
## t = 77.045, df = 3656, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 90 percent confidence interval:
## 0.7760752 0.7968197
## sample estimates:
## cor
## 0.7866694
# One-sided Kendall test (H1: tau < 0); the observed tau is positive,
# hence p-value = 1 (no evidence of a negative association)
cor.test(framingham$sysBP, framingham$diaBP, method = c("kendall"),alternative = "less")
##
## Kendall's rank correlation tau
##
## data: framingham$sysBP and framingham$diaBP
## z = 53.574, p-value = 1
## alternative hypothesis: true tau is less than 0
## sample estimates:
## tau
## 0.599936
# exact = FALSE avoids the "Cannot compute exact p-value with ties" warning
cor.test(framingham$sysBP, framingham$diaBP, method = c("spearman"),exact = FALSE)
##
## Spearman's rank correlation rho
##
## data: framingham$sysBP and framingham$diaBP
## S = 1792113351, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.7803225
#Visualization: scatter plot and correlation heatmap
plot(framingham$sysBP, framingham$diaBP)
library("ggpubr")
## Loading required package: ggplot2
# Scatter plot with regression line, confidence band, and Spearman's rho annotated
ggscatter(framingham, x = "sysBP", y = "diaBP",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "spearman",
xlab = "sysBP", ylab = "diaBP")
#correlation matrix for multiple variables from dataset
Cor.framingham = cor(framingham[,c("age","cigsPerDay","totChol","sysBP","diaBP","BMI","heartRate","glucose")])
#view(Cor.framingham)
library(corrplot)
## corrplot 0.92 loaded
# Upper-triangle correlogram; variables ordered by hierarchical clustering
corrplot(Cor.framingham, type = "upper", order = "hclust",
tl.col = "black", tl.srt = 45)
\(\bf{Exercise1:}\) Is there significant correlation between “cigsPerDay” and “BMI”?
#check assumption: neither variable is normal, so use Spearman's rank correlation
shapiro.test(framingham$cigsPerDay) # => p < 2.2e-16
##
## Shapiro-Wilk normality test
##
## data: framingham$cigsPerDay
## W = 0.76257, p-value < 2.2e-16
shapiro.test(framingham$BMI) # => p < 2.2e-16
##
## Shapiro-Wilk normality test
##
## data: framingham$BMI
## W = 0.95655, p-value < 2.2e-16
#QQ plot
qqnorm(framingham$cigsPerDay, pch = 1, frame = FALSE)
qqline(framingham$cigsPerDay, col = "steelblue", lwd = 2)
qqnorm(framingham$BMI, pch = 1, frame = FALSE)
qqline(framingham$BMI, col = "steelblue", lwd = 2)
# Weak but statistically significant negative correlation (rho = -0.133, p < 0.001)
cor.test(framingham$cigsPerDay, framingham$BMI, method = c("spearman"),exact = FALSE)
##
## Spearman's rank correlation rho
##
## data: framingham$cigsPerDay and framingham$BMI
## S = 9239929695, p-value = 7.971e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.132632
PCA is used in exploratory data analysis.
PCA commonly used for \(\bf{dimensionality \space reduction}\) by using each data point onto only the first few principal components (most cases first and second dimensions) to obtain lower-dimensional data while keeping as much of the data’s variation as possible.
\(\bf{The \space first \space principal \space component}\) can equivalently be defined as \(a \space direction \space that \space maximizes \space the \space variance \space of \space the \space projected \space data.\)
The principal components are often analyzed by eigen-decomposition of the data covariance matrix or singular value decomposition (SVD) of the data matrix.
\(\bf{Question:}\) We are interested in the Heartdisease classification. (Set the \(\bf{"TenYearCHD"}\) as dependent/response/Y variable.)
Set \(\bf{"age","cigsPerDay","totChol","sysBP","diaBP","BMI","heartRate","glucose"}\) as independent/exploratory/X variables.
# Treat the response as a factor so it can colour the groups in the plots below
framingham$TenYearCHD = factor(framingham$TenYearCHD)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Scatter-plot matrix of the 8 numeric predictors
# (columns 2,5,10:15 = age, cigsPerDay, totChol, sysBP, diaBP, BMI, heartRate, glucose),
# points coloured by TenYearCHD group
pairs.panels(framingham[,c(2,5,10,11,12,13,14,15)],
gap=0,
bg = c("red", "blue")[framingham$TenYearCHD],
pch=21)
#prcomp() from package "stats"
pc <- prcomp(framingham[,c(2,5,10,11,12,13,14,15)],
center = TRUE, #variables should be shifted to be zero centered
scale = TRUE) #the variables should be scaled to have unit variance before the analysis takes place.
print(pc)
## Standard deviations (1, .., p=8):
## [1] 1.5543466 1.0778875 1.0101947 0.9859469 0.9151943 0.8806890 0.7940317
## [8] 0.4311618
##
## Rotation (n x k) = (8 x 8):
## PC1 PC2 PC3 PC4 PC5
## age 0.3492968 0.47241405 0.16433154 -0.2421434477 -0.06918236
## cigsPerDay -0.1314881 -0.62969887 0.08825707 -0.3658378145 -0.60887479
## totChol 0.2702918 0.17231700 0.31926829 -0.6748575961 -0.02842964
## sysBP 0.5643905 -0.10066205 -0.13841875 0.0181053343 -0.04128465
## diaBP 0.5313974 -0.23103615 -0.27299890 0.0659091649 -0.03355063
## BMI 0.3603088 -0.08702068 -0.27315673 0.2528254651 -0.18088792
## heartRate 0.1813667 -0.52347483 0.44007417 0.0002491086 0.65752329
## glucose 0.1549480 0.07417607 0.70802936 0.5324508148 -0.39470198
## PC6 PC7 PC8
## age -0.323117191 -0.6552449 0.175047380
## cigsPerDay -0.129075713 -0.2391096 -0.001068176
## totChol 0.437640985 0.3841038 -0.003260193
## sysBP -0.331150765 0.1630321 -0.716866991
## diaBP -0.202514158 0.3063715 0.670347361
## BMI 0.731464834 -0.3905800 -0.061283762
## heartRate 0.040333090 -0.2552839 0.016773361
## glucose 0.008045824 0.1666717 0.045229773
\(\bf{Interpretation}\) from Rotation matrix (Loading matrix):
For example, PC1 increases when age, totChol, sysBP, BMI, etc., increase, because those variables are positively correlated with PC1, whereas PC1 increases as cigsPerDay decreases, because those two are negatively correlated.
# Variance explained per component (standard deviation, proportion, cumulative)
summary.pc = summary(pc)
summary.pc
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.554 1.0779 1.0102 0.9859 0.9152 0.88069 0.79403
## Proportion of Variance 0.302 0.1452 0.1276 0.1215 0.1047 0.09695 0.07881
## Cumulative Proportion 0.302 0.4472 0.5748 0.6963 0.8010 0.89795 0.97676
## PC8
## Standard deviation 0.43116
## Proportion of Variance 0.02324
## Cumulative Proportion 1.00000
\(\bf{Interpretation:}\) First five principal components explain the variability around 80.10% and they capture the majority of the variability.
#Scree plot
# Extract the proportion of variance explained (row 2 of the importance table)
var.prop = summary.pc$importance[2,]
plot(var.prop, type = "b",
xlab = "Principal Component",
ylab = "Variance explained")
# Add a vertical line at x = 2 to mark the elbow of the scree plot
abline(v = 2, col = "red")
# Scatter-plot matrix of the PC scores; principal components are
# uncorrelated by construction, colours show the TenYearCHD groups
pairs.panels(pc$x,
gap=0,
bg = c("red", "blue")[framingham$TenYearCHD],
pch=21)
#Bi-plot
library(devtools)
## Loading required package: usethis
# ggbiplot is installed from GitHub (not on CRAN at the time of writing)
devtools::install_github("vqv/ggbiplot")
## Skipping install of 'ggbiplot' from a github remote, the SHA1 (f7ea76da) has not changed since last install.
## Use `force = TRUE` to force installation
library(ggbiplot)
## Loading required package: plyr
##
## Attaching package: 'plyr'
## The following object is masked from 'package:ggpubr':
##
## mutate
## Loading required package: scales
##
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
##
## alpha, rescale
## The following object is masked from 'package:readr':
##
## col_factor
## Loading required package: grid
# Biplot of PC1 vs PC2 with a 68% normal-probability ellipse per CHD group
g <- ggbiplot(pc,
choices = c(1,2),
obs.scale = 1,
var.scale = 1,
groups = framingham$TenYearCHD,
ellipse = TRUE,
ellipse.prob = 0.68)
g <- g + scale_color_discrete(name = '')
g <- g + theme(legend.direction = 'horizontal',
legend.position = 'top')
print(g)
\(\bf{Interpret:}\)
PC1 explains about 30.2% and PC2 explained about 14.5% of variability.
Arrows that are closer to each other indicate a higher correlation between the corresponding variables.
For example, the arrows for cigsPerDay and age point in very different directions, indicating that the two variables are only weakly correlated.
Another way interpreting the plot is PC1 is positively correlated with the variables HeartRate, diaBP, BMI, sysBP, glucose, totChol, age, and PC1 is negatively correlated with cigsPerDay; PC2 is highly negatively correlated with cigsPerDay, heartRate, etc.
Bi plot is an important tool in PCA to understand what is going on in the dataset.
\(\bf{Exercise 2:}\) If focus on male only, Set the “currentSmoker” as dependent/response/Y variable. Set “age”, “cigsPerDay”, “education”, “totChol”, “sysBP”, “BMI”, “heartRate”, “glucose” as independent/exploratory/X variables, how many PCs could explain the variability above 60%? Can we separate two group of observations using first two PCs?
# Exercise 2: restrict to males, PCA on 8 predictors, colour by smoking status
framingham.male = framingham[which(framingham$male==1),]
framingham.male$currentSmoker = factor(framingham.male$currentSmoker)
pc <- prcomp(framingham.male[,c("age","cigsPerDay","education","totChol","sysBP","BMI","heartRate","glucose")],
center = TRUE, #variables should be shifted to be zero centered
scale = TRUE) #the variables should be scaled to have unit variance before the analysis takes place.
summary(pc)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.2582 1.1232 1.0367 0.9936 0.9637 0.9219 0.84609
## Proportion of Variance 0.1979 0.1577 0.1343 0.1234 0.1161 0.1062 0.08948
## Cumulative Proportion 0.1979 0.3556 0.4899 0.6133 0.7294 0.8357 0.92514
## PC8
## Standard deviation 0.77390
## Proportion of Variance 0.07486
## Cumulative Proportion 1.00000
# NOTE: choices = c(2,3) plots PC2 vs PC3 here (not PC1 vs PC2)
g <- ggbiplot(pc,
choices = c(2,3),
obs.scale = 1,
var.scale = 1,
groups = framingham.male$currentSmoker,
ellipse = TRUE,
ellipse.prob = 0.68)
g <- g + scale_color_discrete(name = '')
g <- g + theme(legend.direction = 'horizontal',
legend.position = 'top')
print(g)
\(\bf{Null \space hypothesis:}\) the mean BMI is equal to 25.
\(\bf{Alternative \space hypothesis:}\) the mean BMI is not equal to 25.
Assumption: The values in each sample are normally distributed
#One-sample t-test: H0: mean BMI = 25 vs H1: mean BMI != 25
t.test(framingham$BMI,alternative = "two.sided", mu=25, conf.level = 0.95)
##
## One Sample t-test
##
## data: framingham$BMI
## t = 11.645, df = 3657, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 25
## 95 percent confidence interval:
## 25.65101 25.91460
## sample estimates:
## mean of x
## 25.7828
#check normality
shapiro.test(framingham$BMI) # => p < 0.05 => the distribution of the data is not normal
##
## Shapiro-Wilk normality test
##
## data: framingham$BMI
## W = 0.95655, p-value < 2.2e-16
qqnorm(framingham$BMI, pch = 1, frame = FALSE)
qqline(framingham$BMI, col = "steelblue", lwd = 2)
#Alternative non-parametric test: the one-sample Wilcoxon signed-rank test
#(tests whether the location differs from mu = 25)
wilcox.test(framingham$BMI,alternative = "two.sided", mu=25, conf.level = 0.95)
##
## Wilcoxon signed rank test with continuity correction
##
## data: framingham$BMI
## V = 3888730, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 25
\(\bf{Null \space hypothesis:}\) Female’s heart rate is less than or equal to male’s average heart rate
\(\bf{Alternative \space hypothesis:}\) Female’s heart rate is greater than male’s average heart rate
Assumption:
Normality: The values in each sample are normally distributed
Equal variances: The variances of the two samples are equal
#Two-sample t-test: female heart rate (x) vs male heart rate (y)
male = framingham$heartRate[framingham$male==1]
female = framingham$heartRate[framingham$male!=1]
#assume normal distribution and equal variance between two groups
# alternative = "greater" tests H1: mean(female) > mean(male)
t.test(female,male,alternative = "greater",conf.level = 0.95,var.equal=TRUE)
##
## Two Sample t-test
##
## data: female and male
## t = 7.0175, df = 3656, p-value = 1.34e-12
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 2.128147 Inf
## sample estimates:
## mean of x mean of y
## 76.96413 74.18423
# p < 0.05 => Female's heart rate is significantly greater than male's average heart rate
#check normality
shapiro.test(male) # => p < 0.05 => the distribution of the data is not normal
##
## Shapiro-Wilk normality test
##
## data: male
## W = 0.98056, p-value = 5.045e-14
shapiro.test(female) # => p < 0.05 => the distribution of the data is not normal
##
## Shapiro-Wilk normality test
##
## data: female
## W = 0.96638, p-value < 2.2e-16
qqnorm(male, pch = 1, frame = FALSE)
qqline(male, col = "steelblue", lwd = 2)
qqnorm(female, pch = 1, frame = FALSE)
qqline(female, col = "steelblue", lwd = 2)
#Homogeneity of Variance Test (normal distribution)
#Null hypothesis: equal variance of heart rate between male and female
framingham$male = factor(framingham$male)
bartlett.test(heartRate ~ male,data=framingham)
##
## Bartlett test of homogeneity of variances
##
## data: heartRate by male
## Bartlett's K-squared = 3.1269, df = 1, p-value = 0.07701
# => p = 0.0770 => equal variance of heart rate between male and female
#Homogeneity of Variance Test (less sensitive to the normal distribution)
#Null hypothesis: equal variance of heart rate between male and female
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
leveneTest(heartRate ~ male,data=framingham) # => p = 0.6932 => equal variance of heart rate between male and female
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 0.1556 0.6932
## 3656
#Alternative non-parametric test: Wilcoxon Rank Sum test
#(does not require normality; compares the two distributions' locations)
wilcox.test(female, male, alternative = "greater", conf.level = 0.95)
##
## Wilcoxon rank sum test with continuity correction
##
## data: female and male
## W = 1862764, p-value = 1.231e-11
## alternative hypothesis: true location shift is greater than 0
# p = 1.231e-11 => Female's heart rate is greater than male's average heart rate
\(\bf{Exercise 3:}\) Does current smoker have less glucose or not?
Null: Current smoker’s glucose is greater than or equal to non current smoker’s average glucose
Alternative: Current smoker’s glucose is less than non current smoker’s average glucose
# Exercise 3: compare glucose between current smokers and non-smokers
smoke <- framingham$glucose[framingham$currentSmoker==1]
nonsmoke <- framingham$glucose[framingham$currentSmoker==0]
#check normality
shapiro.test(smoke) # => p < 0.05 => the distribution of the data is not normal
##
## Shapiro-Wilk normality test
##
## data: smoke
## W = 0.59582, p-value < 2.2e-16
shapiro.test(nonsmoke) # => p < 0.05 => the distribution of the data is not normal
##
## Shapiro-Wilk normality test
##
## data: nonsmoke
## W = 0.53639, p-value < 2.2e-16
qqnorm(smoke, pch = 1, frame = FALSE)
qqline(smoke, col = "steelblue", lwd = 2)
qqnorm(nonsmoke, pch = 1, frame = FALSE)
qqline(nonsmoke, col = "steelblue", lwd = 2)
#Alternative non-parametric test: Wilcoxon Rank Sum test
# Normality fails badly (W ~ 0.5-0.6), so use the rank-based test directly
wilcox.test(smoke, nonsmoke, alternative = "less", conf.level = 0.95)
##
## Wilcoxon rank sum test with continuity correction
##
## data: smoke and nonsmoke
## W = 1522882, p-value = 1.532e-06
## alternative hypothesis: true location shift is less than 0
#p < 0.05 => Current smoker's glucose is significantly less than non current smoker's average glucose
Data: Each subject is measured twice before and after the treatment.
\(\bf{Null \space hypothesis:}\) there is no improvement in weight before and after the treatment.
\(\bf{Alternative \space hypothesis:}\) Treatment significantly improved the participants’ weight.
#Paired t-test: each subject is measured before and after the treatment
before <- c(12.2, 14.6, 13.4, 11.2, 12.7, 10.4, 15.8, 13.9, 9.5, 14.2)
after <- c(13.5, 15.2, 13.6, 12.8, 13.7, 11.3, 16.5, 13.4, 8.7, 14.6)
# Long-format data frame: 10 subjects x 2 time points
data <- data.frame(subject = rep(c(1:10), 2),
time = as.factor(rep(c("before", "after"), each = 10)),
score = c(before, after))
str(data)
## 'data.frame': 20 obs. of 3 variables:
## $ subject: int 1 2 3 4 5 6 7 8 9 10 ...
## $ time : Factor w/ 2 levels "after","before": 2 2 2 2 2 2 2 2 2 2 ...
## $ score : num 12.2 14.6 13.4 11.2 12.7 10.4 15.8 13.9 9.5 14.2 ...
#check normality
shapiro.test(data$score[data$time=="before"])
##
## Shapiro-Wilk normality test
##
## data: data$score[data$time == "before"]
## W = 0.97673, p-value = 0.9453
# => p =0.9453 => the distribution of the data are not significantly different from normal distribution
shapiro.test(data$score[data$time=="after"])
##
## Shapiro-Wilk normality test
##
## data: data$score[data$time == "after"]
## W = 0.92747, p-value = 0.4234
# => p=0.4234 => the distribution of the data are not significantly different from normal distribution
qqnorm(data$score[data$time=="before"], pch = 1, frame = FALSE)
qqline(data$score[data$time=="before"], col = "steelblue", lwd = 2)
qqnorm(data$score[data$time=="after"], pch = 1, frame = FALSE)
qqline(data$score[data$time=="after"], col = "steelblue", lwd = 2)
#Homogeneity of Variance Test (normal distribution)
bartlett.test(score ~ time,data=data)
##
## Bartlett test of homogeneity of variances
##
## data: score by time
## Bartlett's K-squared = 0.050609, df = 1, p-value = 0.822
# => p =0.822 => equal variance of knowledge score between before and after group
#test
# Factor levels are "after","before", so alternative = "greater" tests
# mean(after) - mean(before) > 0.
# NOTE(review): var.equal is ignored for a paired test; also, the
# formula interface with paired = TRUE is deprecated in recent R —
# the one-sample form on the differences (below) is the safer equivalent.
t.test(formula = score ~ time,
data = data,
alternative = "greater",
paired = TRUE,
var.equal = TRUE,
conf.level = 0.95)
##
## Paired t-test
##
## data: score by time
## t = 2.272, df = 9, p-value = 0.0246
## alternative hypothesis: true mean difference is greater than 0
## 95 percent confidence interval:
## 0.1043169 Inf
## sample estimates:
## mean difference
## 0.54
# => p=0.0246 < 0.05
#Treatment significantly increases the participants’ weight.
#equivalent to one sample ttest for diff = after-before
t.test(after-before,
alternative = "greater",
mu = 0,
conf.level = 0.95)
##
## One Sample t-test
##
## data: after - before
## t = 2.272, df = 9, p-value = 0.0246
## alternative hypothesis: true mean is greater than 0
## 95 percent confidence interval:
## 0.1043169 Inf
## sample estimates:
## mean of x
## 0.54
\(\bf{Null \space hypothesis:}\) the means of BMI from all education levels are identical.
\(\bf{Alternative \space hypothesis:}\) There is at least one education level’s average BMI different from others.
#one-way ANOVA: does mean BMI differ across the four education levels?
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Calculate per-group summary statistics (n, mean, sd of BMI)
framingham$education=factor(framingham$education)
group_by(framingham, education) %>%
dplyr::summarise(
count = n(),
mean = mean(BMI, na.rm = TRUE),
sd = sd(BMI, na.rm = TRUE)
)
## # A tibble: 4 × 4
## education count mean sd
## <fct> <int> <dbl> <dbl>
## 1 1 1526 26.6 4.47
## 2 2 1101 25.3 3.83
## 3 3 608 25.1 3.40
## 4 4 423 25.3 3.50
library("ggpubr")
# Plot group means of BMI with standard-error bars by education level
ggline(framingham, x = "education", y = "BMI",
add = c("mean_se"),color = "red",
add.params = list(color = "black"),
ylab = "BMI", xlab = "education")
#assume normality and homogeneity of variance
res.aov <- aov(BMI ~ education, data = framingham)
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## education 3 1675 558.5 34.72 <2e-16 ***
## Residuals 3654 58771 16.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# => p < 2e-16 => at least one group that BMI is significantly different from others
#Tukey HSD (Tukey Honest Significant Differences): all pairwise group
#comparisons with family-wise 95% confidence, controlling for multiplicity
TukeyHSD(res.aov)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = BMI ~ education, data = framingham)
##
## $education
## diff lwr upr p adj
## 2-1 -1.30542859 -1.7130222 -0.8978350 0.0000000
## 3-1 -1.50467905 -1.9990312 -1.0103269 0.0000000
## 4-1 -1.32308630 -1.8894909 -0.7566817 0.0000000
## 3-2 -0.19925046 -0.7200779 0.3215770 0.7590166
## 4-2 -0.01765771 -0.6073112 0.5719958 0.9998366
## 4-3 0.18159275 -0.4710502 0.8342357 0.8911233
#the average BMI from education level one is significantly greater than the average BMI from all other three levels.
# check normal distribution within each education level
shapiro.test(framingham$BMI[framingham$education==1])
##
## Shapiro-Wilk normality test
##
## data: framingham$BMI[framingham$education == 1]
## W = 0.94629, p-value < 2.2e-16
shapiro.test(framingham$BMI[framingham$education==2])
##
## Shapiro-Wilk normality test
##
## data: framingham$BMI[framingham$education == 2]
## W = 0.97414, p-value = 4.076e-13
shapiro.test(framingham$BMI[framingham$education==3])
##
## Shapiro-Wilk normality test
##
## data: framingham$BMI[framingham$education == 3]
## W = 0.98283, p-value = 1.431e-06
shapiro.test(framingham$BMI[framingham$education==4])
##
## Shapiro-Wilk normality test
##
## data: framingham$BMI[framingham$education == 4]
## W = 0.96915, p-value = 8.79e-08
# => not normal distribution in any group
qqnorm(framingham$BMI[framingham$education==1], pch = 1, frame = FALSE)
qqline(framingham$BMI[framingham$education==1], col = "steelblue", lwd = 2)
qqnorm(framingham$BMI[framingham$education==2], pch = 1, frame = FALSE)
qqline(framingham$BMI[framingham$education==2], col = "steelblue", lwd = 2)
qqnorm(framingham$BMI[framingham$education==3], pch = 1, frame = FALSE)
qqline(framingham$BMI[framingham$education==3], col = "steelblue", lwd = 2)
qqnorm(framingham$BMI[framingham$education==4], pch = 1, frame = FALSE)
qqline(framingham$BMI[framingham$education==4], col = "steelblue", lwd = 2)
#Homogeneity of Variance Test
bartlett.test(BMI ~ education, data = framingham)
##
## Bartlett test of homogeneity of variances
##
## data: BMI by education
## Bartlett's K-squared = 86.68, df = 3, p-value < 2.2e-16
# => p < 0.05 # different variance across groups => equal-variance ANOVA assumption violated
#ANOVA test with no equal variance assumption, but normal distribution
#(Welch's one-way test)
oneway.test(BMI ~ education, data = framingham)
##
## One-way analysis of means (not assuming equal variances)
##
## data: BMI and education
## F = 33.084, num df = 3, denom df = 1443, p-value < 2.2e-16
# => p-value < 2.2e-16 => at least one group are different from others
#Pairwise t-tests with no assumption of equal variances
#(p-values adjusted by Benjamini-Hochberg to control the false discovery rate)
pairwise.t.test(framingham$BMI, framingham$education,
p.adjust.method = "BH", pool.sd = FALSE)
##
## Pairwise comparisons using t tests with non-pooled SD
##
## data: framingham$BMI and framingham$education
##
## 1 2 3
## 2 4.2e-15 - -
## 3 6.7e-16 0.40 -
## 4 3.7e-10 0.93 0.49
##
## P value adjustment method: BH
#the average BMI from education level one is significantly different from the average BMI from all other three levels.
#ANOVA test with no normal distribution
#The Kruskal-Wallis rank-sum test is a non-parametric alternative to one-way ANOVA that can be employed when the ANOVA assumptions are not met.
kruskal.test(BMI ~ education, data = framingham)
##
## Kruskal-Wallis rank sum test
##
## data: BMI by education
## Kruskal-Wallis chi-squared = 93.851, df = 3, p-value < 2.2e-16
# Non-parametric pairwise follow-up with Bonferroni adjustment
pairwise.wilcox.test(framingham$BMI, framingham$education, p.adj = "bonf")
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: framingham$BMI and framingham$education
##
## 1 2 3
## 2 2.8e-14 - -
## 3 6.7e-13 1 -
## 4 8.9e-08 1 1
##
## P value adjustment method: bonferroni
Two-way ANOVA test is used to evaluate simultaneously the effect of two grouping variables (A and B) on a response variable.
The grouping variables are also known as factors. The different categories (groups) of a factor are called levels. The number of levels can vary between factors. The level combinations of factors are called cell.
Two-way ANOVA test hypotheses:
1: There is no difference in means of factor A
2: There is no difference in means of factor B
3: There is no interaction between factors A and B
The alternative hypothesis for cases 1 and 2 is: the means are not equal.
The alternative hypothesis for case 3 is: there is an interaction between A and B.
Assumes that the observations within each cell are normally distributed and have equal variances.
Let’s focus on “education”, “age” and “currentSmoker”.
# Two-way ANOVA: effect of education and currentSmoker on age
my_data <- framingham[c("education","age","currentSmoker")]
# Recode the 0/1 indicator to descriptive labels before converting to a factor
my_data$currentSmoker <- ifelse(my_data$currentSmoker==1,"smoke","nonsmoke")
my_data$education = factor(my_data$education)
my_data$currentSmoker = factor(my_data$currentSmoker)
str(my_data)
## tibble [3,658 × 3] (S3: tbl_df/tbl/data.frame)
## $ education : Factor w/ 4 levels "1","2","3","4": 4 2 1 3 3 2 1 2 1 1 ...
## $ age : num [1:3658] 39 46 48 61 46 43 63 45 52 43 ...
## $ currentSmoker: Factor w/ 2 levels "nonsmoke","smoke": 1 1 2 2 2 1 1 2 1 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:582] 15 22 27 34 37 43 50 55 71 73 ...
## ..- attr(*, "names")= chr [1:582] "15" "22" "27" "34" ...
# Cell counts for each smoking-status x education combination
table(my_data$currentSmoker,my_data$education)
##
## 1 2 3 4
## nonsmoke 824 513 324 208
## smoke 702 588 284 215
# Per-cell summary statistics (n, mean, sd of age)
group_by(my_data, education,currentSmoker) %>%
dplyr::summarise(
count = n(),
mean = mean(age, na.rm = TRUE),
sd = sd(age, na.rm = TRUE)
)
## `summarise()` has grouped output by 'education'. You can override using the
## `.groups` argument.
## # A tibble: 8 × 5
## # Groups: education [4]
## education currentSmoker count mean sd
## <fct> <fct> <int> <dbl> <dbl>
## 1 1 nonsmoke 824 53.7 8.18
## 2 1 smoke 702 49.7 8.17
## 3 2 nonsmoke 513 48.9 8.79
## 4 2 smoke 588 45.8 7.56
## 5 3 nonsmoke 324 50.5 8.43
## 6 3 smoke 284 47.1 7.41
## 7 4 nonsmoke 208 49.2 8.60
## 8 4 smoke 215 47.2 7.86
library(ggplot2)
library(ggsci)
# Boxplots of age by education, coloured by smoking status
ggboxplot(my_data, x = "education", y = "age", color = "currentSmoker") +
theme_minimal() +
scale_fill_simpsons() + #nejm
scale_color_simpsons()
# Mean +/- SE line plot overlaid on violins
ggline(my_data, x = "education", y = "age", color = "currentSmoker",
add = c("mean_se", "violin"))+
theme_minimal() +
scale_fill_simpsons() + #nejm
scale_color_simpsons()
# Additive two-way ANOVA (main effects only, no interaction term)
res.aov2 <- aov(age ~ education + currentSmoker, data = my_data)
summary(res.aov2)
## Df Sum Sq Mean Sq F value Pr(>F)
## education 3 15208 5069 76.37 <2e-16 ***
## currentSmoker 1 10416 10416 156.93 <2e-16 ***
## Residuals 3653 242465 66
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From the ANOVA table we can conclude that mean age differs statistically significantly across the levels of education and across the levels of currentSmoker.
# Two-way ANOVA with interaction effect
# These two calls are equivalent
res.aov3 <- aov(age ~ education * currentSmoker, data = my_data)
res.aov3 <- aov(age ~ education + currentSmoker + education:currentSmoker, data = my_data)
summary(res.aov3)
## Df Sum Sq Mean Sq F value Pr(>F)
## education 3 15208 5069 76.427 <2e-16 ***
## currentSmoker 1 10416 10416 157.037 <2e-16 ***
## education:currentSmoker 3 366 122 1.841 0.138
## Residuals 3650 242099 66
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From the ANOVA results, we can conclude the following, based on the p-values and a significance level of 0.05:
*the p-value of education is < 2e-16 (significant), which indicates that the levels of education are associated with significant different age.
*the p-value of currentSmoker is < 2e-16 (significant), which indicates that the status of currentSmoker are associated with significant different age.
*the p-value for the interaction between education and currentSmoker is 0.138 (non-significant), which indicates that the relationships between levels of education and age doesn’t depend on the status of currentSmoker.
#Check the homogeneity of variance assumption
# Diagnostic plot 1: residuals vs fitted values — look for no visible pattern.
plot(res.aov3, 1)
# Levene's test of equal variances across the 8 education x smoker cells
# (null: all group variances are equal).
# NOTE(review): leveneTest() is from the car package — confirm
# library(car) is loaded earlier in the document.
leveneTest(age ~ education * currentSmoker, data = my_data)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 7 7.2556 1.166e-08 ***
## 3650
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 2. Normality
# Diagnostic plot 2: normal Q-Q plot of the residuals.
plot(res.aov3, 2)
The residuals versus fits plot is used to check the homogeneity of variances. In the plot above, there is no evident relationship (no pattern) between the residuals and the fitted values (the mean of each group), which is good. So, we can assume the homogeneity of variances.
Notice: Points 1202 and 2321 are detected as outliers, which can severely affect normality and homogeneity of variance. It can be useful to remove outliers to meet the test assumptions.
Independence tests are used to determine if there is a significant relationship between two categorical variables. There exist two different types of independence test:
On the one hand, the Chi-square test is used when the sample is large enough (in this case the p-value is an approximation that becomes exact when the sample becomes infinite, which is the case for many statistical tests). On the other hand, the Fisher’s exact test is used when the sample is small (and in this case the p-value is exact and is not an approximation).
The literature indicates that the usual rule for deciding whether the \(\chi^2\) approximation is good enough is that the Chi-square test is not appropriate when the expected value in one of the cells of the contingency table is less than 5, and in this case the Fisher’s exact test is preferred.
The hypotheses of the Fisher’s exact test are the same as for the Chi-square test:
NULL: the variables are independent, there is no relationship between the two categorical variables. Knowing the value of one variable does not help to predict the value of the other variable. Alternative: the variables are dependent, there is a relationship between the two categorical variables. Knowing the value of one variable helps to predict the value of the other variable.
# Cross-tabulate education level (rows) against smoking status (columns).
table(my_data$education,my_data$currentSmoker)
##
## nonsmoke smoke
## 1 824 702
## 2 513 588
## 3 324 284
## 4 208 215
# Convert the contingency table to long format (one row per cell) so it
# can be used as weighted data for the mosaic plot below.
my_data2 = data.frame(table(my_data$education,my_data$currentSmoker))
colnames(my_data2) = c("education","currentSmoker","count")
# Ensure both grouping columns are factors for plotting.
my_data2$education=factor(my_data2$education)
my_data2$currentSmoker=factor(my_data2$currentSmoker)
library(ggmosaic)
# Mosaic plot: tile widths reflect smoker/non-smoker proportions,
# tile heights reflect the education distribution within each group,
# weighted by the cell counts computed above.
ggplot(data = my_data2) +
geom_mosaic(aes(x = product(currentSmoker), fill= education, weight = count)) +
theme_mosaic() +
scale_fill_simpsons() +
scale_color_simpsons() +
scale_x_productlist(expand = c(0, 0)) + # Remove space between x-axis and plot
scale_y_continuous(expand = c(0, 0)) + # Remove space between y-axis and plot
ylab("Percentage") +
theme(
#legend.position = "bottom", # Position legend at the bottom
#legend.direction = "horizontal", # Arrange legend items in a row
#legend.box = "horizontal", # Ensure the legend box is horizontal
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
panel.border = element_blank(), # Remove all borders first
axis.text.x = element_text(angle = 15, hjust = 1) # Rotate x-axis labels
)
# The deprecation warning below originates inside ggmosaic, not this code.
## Warning: `unite_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `unite()` instead.
## ℹ The deprecated feature was likely used in the ggmosaic package.
## Please report the issue at <https://github.com/haleyjeppson/ggmosaic>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Test independence of education level and smoking status.
my_data3 = table(my_data$education, my_data$currentSmoker)
print(my_data3)
##
## nonsmoke smoke
## 1 824 702
## 2 513 588
## 3 324 284
## 4 208 215
# Chi-square test: appropriate here since every expected cell count
# is well above 5 (all observed counts exceed 200).
chisq.test(my_data3)
##
## Pearson's Chi-squared test
##
## data: my_data3
## X-squared = 15.868, df = 3, p-value = 0.001207
# Fisher's exact test on the same table, for comparison with chi-square.
fisher.test(my_data3)
##
## Fisher's Exact Test for Count Data
##
## data: my_data3
## p-value = 0.001204
## alternative hypothesis: two.sided
# p-value<0.05 => there is a significant relationship between the two categorical variables
\(\bf{Exercise 4:}\) Is there significant correlation between “education” and “TenYearCHD”?
# Exercise 4: test independence of education level and 10-year CHD outcome.
my_data4 = table(framingham$education,framingham$TenYearCHD)
print(my_data4)
##
## 0 1
## 1 1235 291
## 2 970 131
## 3 533 75
## 4 363 60
# Chi-square test of independence (all cell counts comfortably exceed 5).
chisq.test(my_data4)
##
## Pearson's Chi-squared test
##
## data: my_data4
## X-squared = 31.199, df = 3, p-value = 7.717e-07
# Fisher's exact test on the same table, for comparison.
fisher.test(my_data4)
##
## Fisher's Exact Test for Count Data
##
## data: my_data4
## p-value = 9.354e-07
## alternative hypothesis: two.sided
# p-value<0.05 => there is a significant relationship between the two categorical variables
\(Y = a + b*X\): Y variable (the dependent or response variable) is a function of the X variable\
\(\bf{Question:}\) For current smoker, how the sysBP change as age increase?
Y = sysBP, X = age => sysBP~age
# Scatterplot of systolic BP against age, restricted to current smokers.
plot(sysBP~age, data = framingham[framingham$currentSmoker==1,])
# Fit simple linear regression sysBP ~ age on the smoker subset only.
lm.simple = lm(sysBP~age, data = framingham[framingham$currentSmoker==1,]) #Create the linear regression
lm.simple #If we look at ‘lm.simple’ we don’t get much to work with, just the coefficient estimates for the intercept and slope
##
## Call:
## lm(formula = sysBP ~ age, data = framingham[framingham$currentSmoker ==
## 1, ])
##
## Coefficients:
## (Intercept) age
## 84.6990 0.9356
summary(lm.simple) #Review the results
##
## Call:
## lm(formula = sysBP ~ age, data = framingham[framingham$currentSmoker ==
## 1, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.466 -12.753 -2.224 9.390 94.776
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 84.69901 2.72582 31.07 <2e-16 ***
## age 0.93564 0.05635 16.60 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.05 on 1787 degrees of freedom
## Multiple R-squared: 0.1337, Adjusted R-squared: 0.1332
## F-statistic: 275.7 on 1 and 1787 DF, p-value: < 2.2e-16
#Intepret:
#1. the p-value for the slope (age) coefficient is less than 0.05, allowing you to say that the association of sysBP and age is statistically significant.
#2. sysBP = 84.699 + 0.936 * age
#3. R-squared: proportion of variance in the data explained by the model
#4. F-statistic and p-value: test whether all coefficients in the model are zero.
# Plot the scatterplot as before:
plot(sysBP~age, data = framingham[framingham$currentSmoker==1,])
# And then plot the fitted line:
abline(lm.simple,col="red")
\(Y = a + b1*X1 + b2*X2 + b3*X3...\): Y variable (the dependent or response variable) is a function of multiple X variables.\
Y = glucose, X = all other variables in dataset => glucose ~.
\(\bf{Equation:}\) Glucose = 61.8266 + 0.8771 * 1(male==1) + 0.0420 * age + 0.4276 * 1(education=2) + 1.5228 * 1(education=3) + …
# Multiple linear regression: glucose against every other column.
# The `.` in the formula expands to all remaining variables in framingham.
# NOTE(review): the output shows factor-coded terms (male1, education2,
# TenYearCHD1), so the categorical columns were presumably converted to
# factors earlier in the document — confirm with the preceding chunks.
lm.multiple = lm(glucose~., data = framingham) #Create the linear regression
summary(lm.multiple) #Review the results
##
## Call:
## lm(formula = glucose ~ ., data = framingham)
##
## Residuals:
## Min 1Q Median 3Q Max
## -125.744 -8.483 -1.512 6.806 222.378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.826614 4.192430 14.747 < 2e-16 ***
## male1 0.877188 0.689320 1.273 0.203262
## age 0.042007 0.043110 0.974 0.329914
## education2 0.427602 0.765963 0.558 0.576705
## education3 1.522783 0.914165 1.666 0.095846 .
## education4 0.040060 1.049186 0.038 0.969545
## currentSmoker 0.088767 0.993729 0.089 0.928827
## cigsPerDay -0.072143 0.042863 -1.683 0.092444 .
## BPMeds 0.838561 1.899117 0.442 0.658839
## prevalentStroke 1.444148 4.126220 0.350 0.726364
## prevalentHyp -1.294709 0.955203 -1.355 0.175365
## diabetes 87.902453 1.932748 45.481 < 2e-16 ***
## totChol -0.001891 0.007405 -0.255 0.798466
## sysBP 0.116131 0.027171 4.274 1.97e-05 ***
## diaBP -0.130649 0.044650 -2.926 0.003454 **
## BMI 0.089366 0.084830 1.053 0.292195
## heartRate 0.116824 0.026915 4.341 1.46e-05 ***
## TenYearCHD1 3.446662 0.905364 3.807 0.000143 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.68 on 3640 degrees of freedom
## Multiple R-squared: 0.392, Adjusted R-squared: 0.3892
## F-statistic: 138.1 on 17 and 3640 DF, p-value: < 2.2e-16
# => totChol does not significant affect the regression model.
#Detect Influential Points
# Cook's distance measures how much each observation influences the
# fitted coefficients.
cookdis = cooks.distance(lm.multiple)
plot(cooks.distance(lm.multiple), pch = 16, col = "blue") #Plot the Cooks Distances.
# Flag observations with Cook's distance above 0.05.
# NOTE(review): 0.05 is an ad-hoc cutoff chosen here; common rules of
# thumb (e.g. 4/n) differ — confirm this threshold is intended.
potential.out = which(cookdis > 0.05)
# Possible reasons an observation is influential:
#1. Someone made a recording error
#2. Someone made a fundamental mistake collecting the observation
#3. The data point is perfectly valid, in which case the model cannot account for the behavior.
# Refit the same model with the flagged rows removed; note this rebinds
# lm.multiple, replacing the full-data fit above.
lm.multiple = lm(glucose~., data = framingham[-potential.out,]) #Create the linear regression
summary(lm.multiple) #Review the results
##
## Call:
## lm(formula = glucose ~ ., data = framingham[-potential.out, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -106.050 -8.362 -1.642 6.672 145.720
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63.222322 3.570447 17.707 < 2e-16 ***
## male1 0.790449 0.586546 1.348 0.17786
## age 0.035846 0.036664 0.978 0.32830
## education2 -0.120172 0.651696 -0.184 0.85371
## education3 0.427098 0.778895 0.548 0.58349
## education4 0.026162 0.891675 0.029 0.97660
## currentSmoker 0.035369 0.845873 0.042 0.96665
## cigsPerDay -0.079897 0.036482 -2.190 0.02859 *
## BPMeds -1.953630 1.624736 -1.202 0.22928
## prevalentStroke 3.022988 3.506438 0.862 0.38868
## prevalentHyp -0.723313 0.812928 -0.890 0.37365
## diabetes 69.374331 1.716169 40.424 < 2e-16 ***
## totChol -0.003824 0.006308 -0.606 0.54446
## sysBP 0.091420 0.023177 3.944 8.15e-05 ***
## diaBP -0.099003 0.038030 -2.603 0.00927 **
## BMI 0.155231 0.072261 2.148 0.03176 *
## heartRate 0.101699 0.022900 4.441 9.22e-06 ***
## TenYearCHD1 2.355180 0.771469 3.053 0.00228 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.87 on 3631 degrees of freedom
## Multiple R-squared: 0.3401, Adjusted R-squared: 0.337
## F-statistic: 110.1 on 17 and 3631 DF, p-value: < 2.2e-16
#If the case is 1 or 2, then you can remove the point (or correct it). If it's 3, it's not worth deleting a valid point; maybe you can try on a non-linear model rather than a linear model like linear regression.
Logistic regression is a type of regression analysis in statistics used for prediction of outcome of a categorical dependent variable from a set of predictor or independent variables. In logistic regression the dependent variable is always binary. Logistic regression is mainly used to for prediction and also calculating the probability of success.
Logistic regression equation: \(P = e^{\beta_0+\beta_1 X_1 + \beta_2 X_2}/(1+e^{\beta_0+\beta_1 X_1 + \beta_2 X_2})\)
In this dataset, we can generate logistic regression as:
\(logit(p)=log(p/(1−p))=\beta_0+\beta_1∗1_{male}+\beta_2∗age+\beta_3∗cigsPerDay+\beta_4∗totChol+\beta_5∗sysBP+\beta_6∗glucose\)
# Select the predictors and the response by column position:
# 1=male, 2=age, 5=cigsPerDay, 10=totChol, 11=sysBP, 15=glucose,
# 16=TenYearCHD (response). Note this rebinds my_data used earlier.
my_data = framingham[,c(1,2,5,10,11,15,16)]
# Fit the logistic regression of TenYearCHD on all selected predictors.
logit.model <- glm(TenYearCHD ~.,family=binomial(link='logit'),data=my_data)
summary(logit.model)
##
## Call:
## glm(formula = TenYearCHD ~ ., family = binomial(link = "logit"),
## data = my_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.135256 0.475483 -19.213 < 2e-16 ***
## male1 0.561663 0.106823 5.258 1.46e-07 ***
## age 0.065966 0.006425 10.267 < 2e-16 ***
## cigsPerDay 0.019227 0.004175 4.606 4.11e-06 ***
## totChol 0.002280 0.001123 2.031 0.0422 *
## sysBP 0.017527 0.002149 8.155 3.49e-16 ***
## glucose 0.007283 0.001677 4.343 1.40e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3121.2 on 3657 degrees of freedom
## Residual deviance: 2762.8 on 3651 degrees of freedom
## AIC: 2776.8
##
## Number of Fisher Scoring iterations: 5
\(\bf{Interpretation:}\)
This fitted model shows that, holding all other features constant, the odds of getting diagnosed with heart disease for males (male = 1) over that of females (male = 0) is \(e^{0.5617} = 1.7537\). In terms of percent change, we can say that the odds for males are 75.37% higher than the odds for females.
The coefficient for age says that, holding all others constant, we will see a 7% increase in the odds of getting diagnosed with CHD for a one-year increase in age since \(e^{0.0660} = 1.0682\).
Similarly, with every extra cigarette one smokes there is a 2% increase in the odds of CHD.
There is a 1.7% increase in odds for every unit increase in systolic blood pressure.
# Predicted probabilities on the TRAINING data (column 7 = TenYearCHD is
# dropped from newdata). Evaluation is therefore in-sample and optimistic.
logit_P = predict(logit.model, newdata = my_data[,-7],type = 'response' )
# Classify with a 0.5 probability cutoff.
pred.data <- ifelse(logit_P > 0.5,1,0) # Probability check
library(caret)
## Loading required package: lattice
# Confusion matrix with class "1" (CHD) as the positive class.
# Low sensitivity (0.075) reflects the imbalanced outcome at this cutoff.
CM= confusionMatrix(as.factor(pred.data),positive = "1", reference = my_data$TenYearCHD)
print(CM)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3078 515
## 1 23 42
##
## Accuracy : 0.8529
## 95% CI : (0.841, 0.8643)
## No Information Rate : 0.8477
## P-Value [Acc > NIR] : 0.1977
##
## Kappa : 0.1066
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.07540
## Specificity : 0.99258
## Pos Pred Value : 0.64615
## Neg Pred Value : 0.85667
## Prevalence : 0.15227
## Detection Rate : 0.01148
## Detection Prevalence : 0.01777
## Balanced Accuracy : 0.53399
##
## 'Positive' Class : 1
##
pred.data.f = as.factor(pred.data)
#ROC-curve using pROC library
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Build the ROC object from the predicted probabilities (not the 0/1 labels).
roc_score=roc(my_data$TenYearCHD, logit_P) #AUC score
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_score ,main ="ROC curve -- Logistic Regression ")
# Equivalent one-liner that also prints the AUC on the plot.
pROC::roc(my_data$TenYearCHD, logit_P,plot=TRUE, print.auc=TRUE, show.thres=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = my_data$TenYearCHD, predictor = logit_P, plot = TRUE, print.auc = TRUE, show.thres = TRUE)
##
## Data: logit_P in 3101 controls (my_data$TenYearCHD 0) < 557 cases (my_data$TenYearCHD 1).
## Area under the curve: 0.7362
\(\bf{Exercise 5:}\) Using the same explanatory variables as above, compare the logistic regression model for males only with the logistic regression model for females only: which one performs better in terms of AUC?
# Male subset: rows with male==1; columns 2=age, 5=cigsPerDay, 10=totChol,
# 11=sysBP, 15=glucose, 16=TenYearCHD (male itself is dropped — it is
# constant within this subset).
my_data.male = framingham[which(framingham$male==1),c(2,5,10,11,15,16)]
# Fit the male-only logistic regression with the same predictors.
logit.model.male <- glm(TenYearCHD ~.,family=binomial(link='logit'),data=my_data.male)
summary(logit.model.male)
##
## Call:
## glm(formula = TenYearCHD ~ ., family = binomial(link = "logit"),
## data = my_data.male)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.496940 0.721291 -13.167 < 2e-16 ***
## age 0.066431 0.008504 7.812 5.64e-15 ***
## cigsPerDay 0.017426 0.004879 3.571 0.000355 ***
## totChol 0.004035 0.001626 2.482 0.013064 *
## sysBP 0.020904 0.003265 6.402 1.53e-10 ***
## glucose 0.007826 0.002414 3.242 0.001187 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1574.3 on 1622 degrees of freedom
## Residual deviance: 1405.0 on 1617 degrees of freedom
## AIC: 1417
##
## Number of Fisher Scoring iterations: 5
# In-sample predicted probabilities (column 6 = TenYearCHD is dropped).
logit_P.male = predict(logit.model.male, newdata = my_data.male[,-6],type = 'response' )
# Classify with a 0.5 probability cutoff.
pred.data.male <- ifelse(logit_P.male > 0.5,1,0) # Probability check
library(caret)
CM.male= confusionMatrix(as.factor(pred.data.male),positive = "1", reference = my_data.male$TenYearCHD)
print(CM.male)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1296 273
## 1 20 34
##
## Accuracy : 0.8195
## 95% CI : (0.7999, 0.8379)
## No Information Rate : 0.8108
## P-Value [Acc > NIR] : 0.1966
##
## Kappa : 0.1397
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.11075
## Specificity : 0.98480
## Pos Pred Value : 0.62963
## Neg Pred Value : 0.82600
## Prevalence : 0.18916
## Detection Rate : 0.02095
## Detection Prevalence : 0.03327
## Balanced Accuracy : 0.54778
##
## 'Positive' Class : 1
##
#ROC-curve using pROC library
library(pROC)
# AUC for the male-only model: 0.7266.
pROC::roc(my_data.male$TenYearCHD, logit_P.male,plot=TRUE, print.auc=TRUE, show.thres=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = my_data.male$TenYearCHD, predictor = logit_P.male, plot = TRUE, print.auc = TRUE, show.thres = TRUE)
##
## Data: logit_P.male in 1316 controls (my_data.male$TenYearCHD 0) < 307 cases (my_data.male$TenYearCHD 1).
## Area under the curve: 0.7266
# Female subset: rows with male!=1; same predictor/response columns as the
# male subset (2=age, 5=cigsPerDay, 10=totChol, 11=sysBP, 15=glucose,
# 16=TenYearCHD).
my_data.female = framingham[which(framingham$male!=1),c(2,5,10,11,15,16)]
# Fit the female-only logistic regression with the same predictors.
logit.model.female <- glm(TenYearCHD ~.,family=binomial(link='logit'),data=my_data.female)
summary(logit.model.female)
##
## Call:
## glm(formula = TenYearCHD ~ ., family = binomial(link = "logit"),
## data = my_data.female)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.6286022 0.6255214 -13.794 < 2e-16 ***
## age 0.0703954 0.0101780 6.916 4.63e-12 ***
## cigsPerDay 0.0244242 0.0081248 3.006 0.00265 **
## totChol 0.0007287 0.0016166 0.451 0.65218
## sysBP 0.0151603 0.0029107 5.208 1.90e-07 ***
## glucose 0.0068029 0.0023478 2.898 0.00376 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1516.3 on 2034 degrees of freedom
## Residual deviance: 1352.6 on 2029 degrees of freedom
## AIC: 1364.6
##
## Number of Fisher Scoring iterations: 5
# In-sample predicted probabilities (column 6 = TenYearCHD is dropped).
logit_P.female = predict(logit.model.female, newdata = my_data.female[,-6],type = 'response' )
# Classify with a 0.5 probability cutoff.
pred.data.female <- ifelse(logit_P.female > 0.5,1,0) # Probability check
library(caret)
CM.female= confusionMatrix(as.factor(pred.data.female),positive = "1", reference = my_data.female$TenYearCHD)
print(CM.female)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1780 242
## 1 5 8
##
## Accuracy : 0.8786
## 95% CI : (0.8636, 0.8925)
## No Information Rate : 0.8771
## P-Value [Acc > NIR] : 0.4362
##
## Kappa : 0.0493
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.032000
## Specificity : 0.997199
## Pos Pred Value : 0.615385
## Neg Pred Value : 0.880317
## Prevalence : 0.122850
## Detection Rate : 0.003931
## Detection Prevalence : 0.006388
## Balanced Accuracy : 0.514599
##
## 'Positive' Class : 1
##
#ROC-curve using pROC library
library(pROC)
# AUC for the female-only model: 0.7342 — slightly higher than the
# male-only model's 0.7266, answering Exercise 5.
pROC::roc(my_data.female$TenYearCHD, logit_P.female,plot=TRUE, print.auc=TRUE, show.thres=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = my_data.female$TenYearCHD, predictor = logit_P.female, plot = TRUE, print.auc = TRUE, show.thres = TRUE)
##
## Data: logit_P.female in 1785 controls (my_data.female$TenYearCHD 0) < 250 cases (my_data.female$TenYearCHD 1).
## Area under the curve: 0.7342